# Install packages -------------------------------------------------------
# Pin a CRAN mirror so installs behave the same on any machine.
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Install only what is missing. The original script called
# install.packages() unconditionally, re-downloading every package on
# every run — a side effect best avoided in an analysis script.
needed_pkgs <- c("randomForest", "caret", "xgboost", "Metrics")
for (pkg in needed_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# Load libraries ----------------------------------------------------------
library(xgboost)      # Gradient boosting: xgb.DMatrix, xgb.cv, xgboost
library(randomForest) # Bagging / random forest models
library(janitor)      # Data-cleaning helpers
library(tidyverse)    # dplyr, ggplot2, purrr, etc.
library(dplyr)        # Explicit attach (already loaded via tidyverse)
library(mice)         # Multiple imputation of missing values
library(naniar)       # Missingness visualization (vis_miss)
library(Metrics)      # Evaluation metrics
library(rsample)      # Train/test splitting (initial_split)
# Import "college_data.csv" as college_df
college_df <- read.csv("college_data.csv")

# Drop every unweighted column along with the redundant tier-name columns
college_df <- college_df %>%
  select(-matches("unwgt"), -tier_name, -test_band_tier, -flagship)

# Public schools only
public_df <- college_df %>%
  filter(public == "Public")

# Private schools only; in-state / out-of-state columns do not apply here
private_df <- college_df %>%
  filter(public == "Private") %>%
  select(-matches("oostate|instate"))

# Inspect the missingness pattern of each data set
vis_miss(private_df)
vis_miss(public_df)

# Keep only rows with fewer than 4 missing values in each data set
private_df <- private_df %>%
  filter(rowSums(is.na(.)) < 4)
public_df <- public_df %>%
  filter(rowSums(is.na(.)) < 4)
# Impute the remaining missing values with MICE (CART method).
# A fixed seed makes the imputations reproducible; the original calls
# omitted it, so every run produced a different completed data set even
# though the rest of the script seeds its RNG.
imputed_values <- mice(data = private_df,  # Data to impute
                       m = 1,              # Number of imputations
                       maxit = 40,         # Max iterations
                       method = "cart",    # Imputation method
                       seed = 111111,      # Reproducible imputation
                       print = FALSE)
## (earlier unseeded run) Warning: Number of logged events: 84
private_df <- complete(imputed_values, 1)

imputed_values_public <- mice(data = public_df,
                              m = 1,
                              maxit = 40,
                              method = "cart",
                              seed = 111111,
                              print = FALSE)
## (earlier unseeded run) Warning: Number of logged events: 244
public_df <- complete(imputed_values_public, 1)

# Confirm visually that no missing values remain
vis_miss(private_df)
vis_miss(public_df)
# The data frames are now separated by sector, so the public indicator
# column carries no information; drop it from both.
public_df$public <- NULL
private_df$public <- NULL

# Encode tier and name as factors in the public data
public_df <- public_df %>%
  mutate(across(c(tier, name), as.factor))

# 80/20 train/test split of the private data
set.seed(111111)
split <- initial_split(private_df, prop = 0.8)
private_train_df <- training(split)
private_test_df <- testing(split)

# Report the split sizes
cat("Training data size: ", nrow(private_train_df), "\n")
## Training data size: 844
cat("Test data size: ", nrow(private_test_df), "\n")
## Test data size: 212

# Identical 80/20 split for the public data
set.seed(111111)
split <- initial_split(public_df, prop = 0.8)
public_train_df <- training(split)
public_test_df <- testing(split)
cat("Training data size: ", nrow(public_train_df), "\n")
## Training data size: 489
cat("Test data size: ", nrow(public_test_df), "\n")
## Test data size: 123
# Linear regression model; exclude name and super_opeid, which are
# identifiers rather than predictors.
# NOTE(review): this fits on the full public_df, not public_train_df —
# presumably intentional (inference rather than prediction); confirm.
public_fit <- lm(rel_apply ~ . - name - super_opeid, data = public_df)
summary(public_fit)
##
## Call:
## lm(formula = rel_apply ~ . - name - super_opeid, data = public_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.138679 -0.013715 -0.000181 0.013131 0.123546
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) -3.996e-03 3.743e-02 -0.107
## par_income_bin -6.555e-06 1.425e-04 -0.046
## par_income_lab20-40 3.382e-03 7.440e-03 0.455
## par_income_lab40-60 1.367e-03 1.099e-02 0.124
## par_income_lab60-70 6.745e-03 1.155e-02 0.584
## par_income_lab70-80 4.947e-03 1.367e-02 0.362
## par_income_lab80-90 -4.204e-03 1.594e-02 -0.264
## par_income_lab90-95 -1.583e-02 1.506e-02 -1.051
## par_income_lab95-96 -5.276e-03 7.247e-03 -0.728
## par_income_lab96-97 -8.968e-03 7.030e-03 -1.276
## par_income_lab97-98 -8.733e-03 6.904e-03 -1.265
## par_income_lab98-99 -1.205e-02 6.313e-03 -1.909
## par_income_labTop 1 NA NA NA
## attend 5.192e-01 2.904e+00 0.179
## stderr_attend 4.130e+01 7.538e+01 0.548
## attend_level -1.181e+00 3.027e+00 -0.390
## attend_sat 7.443e+00 3.148e+00 2.365
## stderr_attend_sat -8.530e+01 5.020e+01 -1.699
## attend_level_sat -3.347e+00 3.159e+00 -1.059
## stderr_rel_apply 5.670e+00 9.934e-01 5.707
## rel_attend 6.424e-01 2.273e-02 28.265
## stderr_rel_attend -6.919e-02 3.536e-01 -0.196
## rel_att_cond_app -6.439e-01 3.325e-02 -19.368
## rel_apply_sat 4.703e-01 2.074e-02 22.678
## stderr_rel_apply_sat -2.801e+00 4.575e-01 -6.124
## rel_attend_sat -2.065e-01 1.781e-02 -11.596
## stderr_rel_attend_sat 7.218e-01 1.172e-01 6.158
## rel_att_cond_app_sat 1.300e-01 1.261e-02 10.305
## attend_instate -9.824e-02 1.300e-01 -0.756
## stderr_attend_instate -3.215e+00 1.029e+00 -3.125
## attend_level_instate 2.092e-01 1.424e-01 1.469
## attend_instate_sat -7.028e-02 1.057e-01 -0.665
## stderr_attend_instate_sat 1.476e+00 4.906e-01 3.009
## attend_level_instate_sat -3.073e-02 1.028e-01 -0.299
## attend_oostate 3.972e+00 3.960e+00 1.003
## stderr_attend_oostate -9.077e+01 1.057e+02 -0.859
## attend_level_oostate -7.199e+00 6.925e+00 -1.039
## attend_oostate_sat -1.076e+01 4.259e+00 -2.527
## stderr_attend_oostate_sat 9.686e+01 5.894e+01 1.643
## attend_level_oostate_sat 1.641e+01 8.694e+00 1.887
## rel_apply_instate 6.641e-01 5.429e-02 12.233
## stderr_rel_apply_instate -1.006e+00 5.185e-01 -1.939
## rel_attend_instate -4.963e-01 5.653e-02 -8.779
## stderr_rel_attend_instate -7.404e-02 2.606e-01 -0.284
## rel_att_cond_app_instate 4.853e-01 5.367e-02 9.042
## rel_apply_oostate 2.231e-01 2.058e-02 10.841
## stderr_rel_apply_oostate -1.122e+00 2.801e-01 -4.003
## rel_attend_oostate -1.012e-01 1.172e-02 -8.636
## stderr_rel_attend_oostate 3.099e-02 3.991e-02 0.777
## rel_att_cond_app_oostate 1.108e-01 1.604e-02 6.908
## rel_apply_instate_sat -2.043e-01 3.403e-02 -6.003
## stderr_rel_apply_instate_sat 6.042e-01 2.183e-01 2.768
## rel_attend_instate_sat 1.012e-01 3.002e-02 3.371
## stderr_rel_attend_instate_sat -3.212e-01 1.025e-01 -3.133
## rel_att_cond_app_instate_sat -6.488e-02 2.592e-02 -2.503
## rel_apply_oostate_sat -1.217e-01 1.602e-02 -7.596
## stderr_rel_apply_oostate_sat 4.202e-01 1.286e-01 3.269
## rel_attend_oostate_sat 2.515e-02 6.695e-03 3.756
## stderr_rel_attend_oostate_sat -4.261e-02 1.603e-02 -2.659
## rel_att_cond_app_oostate_sat -1.084e-02 8.971e-03 -1.208
## tierOther elite schools (public and private) -1.121e-02 5.084e-03 -2.205
## tierSelective public 3.872e-04 3.648e-03 0.106
## Pr(>|t|)
## (Intercept) 0.915023
## par_income_bin 0.963327
## par_income_lab20-40 0.649630
## par_income_lab40-60 0.901013
## par_income_lab60-70 0.559404
## par_income_lab70-80 0.717517
## par_income_lab80-90 0.792123
## par_income_lab90-95 0.293564
## par_income_lab95-96 0.466879
## par_income_lab96-97 0.202645
## par_income_lab97-98 0.206454
## par_income_lab98-99 0.056768 .
## par_income_labTop 1 NA
## attend 0.858144
## stderr_attend 0.584035
## attend_level 0.696537
## attend_sat 0.018397 *
## stderr_attend_sat 0.089868 .
## attend_level_sat 0.289849
## stderr_rel_apply 1.88e-08 ***
## rel_attend < 2e-16 ***
## stderr_rel_attend 0.844939
## rel_att_cond_app < 2e-16 ***
## rel_apply_sat < 2e-16 ***
## stderr_rel_apply_sat 1.74e-09 ***
## rel_attend_sat < 2e-16 ***
## stderr_rel_attend_sat 1.42e-09 ***
## rel_att_cond_app_sat < 2e-16 ***
## attend_instate 0.450140
## stderr_attend_instate 0.001872 **
## attend_level_instate 0.142489
## attend_instate_sat 0.506255
## stderr_attend_instate_sat 0.002738 **
## attend_level_instate_sat 0.765205
## attend_oostate 0.316257
## stderr_attend_oostate 0.390661
## attend_level_oostate 0.299037
## attend_oostate_sat 0.011770 *
## stderr_attend_oostate_sat 0.100906
## attend_level_oostate_sat 0.059654 .
## rel_apply_instate < 2e-16 ***
## stderr_rel_apply_instate 0.052960 .
## rel_attend_instate < 2e-16 ***
## stderr_rel_attend_instate 0.776436
## rel_att_cond_app_instate < 2e-16 ***
## rel_apply_oostate < 2e-16 ***
## stderr_rel_apply_oostate 7.10e-05 ***
## rel_attend_oostate < 2e-16 ***
## stderr_rel_attend_oostate 0.437758
## rel_att_cond_app_oostate 1.36e-11 ***
## rel_apply_instate_sat 3.51e-09 ***
## stderr_rel_apply_instate_sat 0.005832 **
## rel_attend_instate_sat 0.000802 ***
## stderr_rel_attend_instate_sat 0.001825 **
## rel_att_cond_app_instate_sat 0.012586 *
## rel_apply_oostate_sat 1.32e-13 ***
## stderr_rel_apply_oostate_sat 0.001148 **
## rel_attend_oostate_sat 0.000191 ***
## stderr_rel_attend_oostate_sat 0.008070 **
## rel_att_cond_app_oostate_sat 0.227521
## tierOther elite schools (public and private) 0.027844 *
## tierSelective public 0.915504
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02869 on 551 degrees of freedom
## Multiple R-squared: 0.9938, Adjusted R-squared: 0.9932
## F-statistic: 1483 on 60 and 551 DF, p-value: < 2.2e-16
# Repeat the linear model for private schools; name and super_opeid are
# identifiers, not predictors.
# NOTE(review): fit on the full private_df rather than the training
# split — presumably intentional for inference; confirm.
private_fit <- lm(rel_apply ~ . - name - super_opeid, data = private_df)
summary(private_fit)
##
## Call:
## lm(formula = rel_apply ~ . - name - super_opeid, data = private_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49347 -0.02754 -0.00059 0.02724 0.26238
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) 3.844e-01 2.198e-02 17.487
## par_income_bin 5.362e-04 1.490e-04 3.599
## par_income_lab20-40 2.345e-02 8.626e-03 2.719
## par_income_lab40-60 3.882e-02 9.440e-03 4.113
## par_income_lab60-70 3.266e-02 1.006e-02 3.247
## par_income_lab70-80 3.806e-02 1.082e-02 3.519
## par_income_lab80-90 3.999e-02 1.149e-02 3.479
## par_income_lab90-95 3.481e-02 1.140e-02 3.053
## par_income_lab95-96 2.475e-02 1.094e-02 2.263
## par_income_lab96-97 4.308e-02 1.064e-02 4.050
## par_income_lab97-98 3.408e-02 1.010e-02 3.375
## par_income_lab98-99 3.685e-02 9.521e-03 3.871
## par_income_labTop 1 NA NA NA
## attend 7.277e+00 2.441e+00 2.981
## stderr_attend -5.889e+01 1.825e+01 -3.227
## attend_level -4.902e+00 2.709e+00 -1.809
## attend_sat -4.764e+00 1.760e+00 -2.707
## stderr_attend_sat 6.850e+01 1.497e+01 4.577
## attend_level_sat 1.561e+00 2.204e+00 0.708
## stderr_rel_apply 3.358e+00 4.402e-01 7.629
## rel_attend 4.414e-01 9.998e-03 44.146
## stderr_rel_attend -3.286e-03 1.318e-01 -0.025
## rel_att_cond_app -4.522e-01 1.935e-02 -23.369
## rel_apply_sat 5.281e-01 1.550e-02 34.078
## stderr_rel_apply_sat -1.290e+00 1.663e-01 -7.757
## rel_attend_sat -1.336e-01 1.021e-02 -13.082
## stderr_rel_attend_sat 3.002e-04 3.929e-02 0.008
## rel_att_cond_app_sat 1.179e-01 1.125e-02 10.474
## tierIvy Plus -1.888e-02 1.243e-02 -1.519
## tierOther elite schools (public and private) -9.060e-03 6.045e-03 -1.499
## tierSelective private 1.507e-02 1.071e-02 1.407
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## par_income_bin 0.000334 ***
## par_income_lab20-40 0.006660 **
## par_income_lab40-60 4.22e-05 ***
## par_income_lab60-70 0.001202 **
## par_income_lab70-80 0.000452 ***
## par_income_lab80-90 0.000524 ***
## par_income_lab90-95 0.002321 **
## par_income_lab95-96 0.023851 *
## par_income_lab96-97 5.51e-05 ***
## par_income_lab97-98 0.000767 ***
## par_income_lab98-99 0.000115 ***
## par_income_labTop 1 NA
## attend 0.002944 **
## stderr_attend 0.001292 **
## attend_level 0.070673 .
## attend_sat 0.006906 **
## stderr_attend_sat 5.29e-06 ***
## attend_level_sat 0.478883
## stderr_rel_apply 5.39e-14 ***
## rel_attend < 2e-16 ***
## stderr_rel_attend 0.980113
## rel_att_cond_app < 2e-16 ***
## rel_apply_sat < 2e-16 ***
## stderr_rel_apply_sat 2.10e-14 ***
## rel_attend_sat < 2e-16 ***
## stderr_rel_attend_sat 0.993905
## rel_att_cond_app_sat < 2e-16 ***
## tierIvy Plus 0.129073
## tierOther elite schools (public and private) 0.134257
## tierSelective private 0.159835
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06017 on 1026 degrees of freedom
## Multiple R-squared: 0.9837, Adjusted R-squared: 0.9833
## F-statistic: 2138 on 29 and 1026 DF, p-value: < 2.2e-16
# Bagging model for private schools: a random forest with mtry equal to
# the number of predictors bags the trees rather than decorrelating them.
set.seed(111111)
bag_mod_priv <- randomForest(rel_apply ~ .,
                             data = private_train_df, # Select df
                             # Use every predictor at each split. The
                             # original hard-coded mtry = 23, which
                             # exceeded the predictor count and made
                             # randomForest warn "invalid mtry: reset to
                             # within valid range" before clamping to 20.
                             mtry = ncol(private_train_df) - 1,
                             ntree = 200) # Number of trees wanted
bag_mod_priv
## (from an earlier run)
## Type of random forest: regression
## Number of trees: 200
## No. of variables tried at each split: 20
## Mean of squared residuals: 0.008951499
## % Var explained: 95.75

# Test-set predictions and RMSE for the private bagging model
bag_preds_priv <- predict(bag_mod_priv, private_test_df)
rmse <- sqrt(mean((bag_preds_priv - private_test_df$rel_apply)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 0.100328598409104"
# Repeat the bagging model for public schools.
set.seed(111111)
bag_mod_pub <- randomForest(rel_apply ~ .,
                            data = public_train_df,
                            # Use every predictor at each split. The
                            # original hard-coded mtry = 53, which
                            # exceeded the predictor count and triggered
                            # the "invalid mtry" warning (clamped to 52).
                            mtry = ncol(public_train_df) - 1,
                            ntree = 200)
bag_mod_pub
## (from an earlier run)
## Type of random forest: regression
## Number of trees: 200
## No. of variables tried at each split: 52
## Mean of squared residuals: 0.008252995
## % Var explained: 92.92

# Test-set predictions and RMSE for the public bagging model
bag_preds_pub <- predict(bag_mod_pub, public_test_df)
rmse <- sqrt(mean((bag_preds_pub - public_test_df$rel_apply)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 0.113082129428566"
# Preparation for XGBoost: drop name (identifier) and par_income_lab
# (redundant with par_income_bin) from every modeling frame.
xg_drop <- c("name", "par_income_lab")
xg_private_train_df <- private_train_df[, setdiff(colnames(private_train_df), xg_drop)]
xg_private_test_df <- private_test_df[, setdiff(colnames(private_test_df), xg_drop)]
xg_public_train_df <- public_train_df[, setdiff(colnames(public_train_df), xg_drop)]
xg_public_test_df <- public_test_df[, setdiff(colnames(public_test_df), xg_drop)]

# Ensure par_income_bin is numeric in every frame
xg_private_train_df$par_income_bin <- as.numeric(xg_private_train_df$par_income_bin)
xg_private_test_df$par_income_bin <- as.numeric(xg_private_test_df$par_income_bin)
xg_public_train_df$par_income_bin <- as.numeric(xg_public_train_df$par_income_bin)
xg_public_test_df$par_income_bin <- as.numeric(xg_public_test_df$par_income_bin)

# Confirm the numeric type
str(xg_private_train_df$par_income_bin)
## num [1:844] 10 98.5 30 85 99.5 92.5 92.5 75 50 10 ...
str(xg_private_test_df$par_income_bin)
## num [1:212] 30 96.5 97.5 50 75 50 75 97.5 75 92.5 ...
str(xg_public_train_df$par_income_bin)
## num [1:489] 98.5 85 10 30 85 98.5 92.5 99.5 75 92.5 ...
str(xg_public_test_df$par_income_bin)
## num [1:123] 95.5 96.5 97.5 50 30 50 95.5 75 65 95.5 ...
# Inspect the tier levels present in each training frame
print(unique(xg_private_train_df$tier))
## [1] "Highly selective private"
## [2] "Other elite schools (public and private)"
## [3] "Ivy Plus"
## [4] "Selective private"
print(unique(xg_public_train_df$tier))
## [1] Selective public
## [2] Highly selective public
## [3] Other elite schools (public and private)
## 3 Levels: Highly selective public ... Selective public

# Map tier to a numeric code, ordered from least to most selective.
# Defining the level vectors once keeps train and test encodings in sync.
private_tier_levels <- c("Selective private",
                         "Highly selective private",
                         "Other elite schools (public and private)",
                         "Ivy Plus")
public_tier_levels <- c("Selective public",
                        "Highly selective public",
                        "Other elite schools (public and private)")
xg_private_train_df$tier <- as.numeric(factor(xg_private_train_df$tier, levels = private_tier_levels))
xg_private_test_df$tier <- as.numeric(factor(xg_private_test_df$tier, levels = private_tier_levels))
xg_public_train_df$tier <- as.numeric(factor(xg_public_train_df$tier, levels = public_tier_levels))
xg_public_test_df$tier <- as.numeric(factor(xg_public_test_df$tier, levels = public_tier_levels))
# Build xgb.DMatrix objects for the private and public models.
# Columns 2:8 and 10:19 (private) / 10:51 (public) are the predictors;
# the skipped column presumably holds rel_apply, the response.
# NOTE(review): the "- 1" on the label looks like a leftover from a
# classification recipe (0-based class labels); for a continuous
# regression target it shifts every label down by 1 — confirm intent.
dtrain_private <- xgb.DMatrix(data = as.matrix(xg_private_train_df[,c(2:8, 10:19)]), label = as.numeric(xg_private_train_df$rel_apply) -1)
dtest_private <- xgb.DMatrix(data = as.matrix(xg_private_test_df[,c(2:8, 10:19)]), label = as.numeric(xg_private_test_df$rel_apply) - 1)
# Same construction for the public frames
dtrain_public <- xgb.DMatrix(data = as.matrix(xg_public_train_df[,c(2:8, 10:51)]), label = as.numeric(xg_public_train_df$rel_apply) -1)
dtest_public <- xgb.DMatrix(data = as.matrix(xg_public_test_df[,c(2:8, 10:51)]), label = as.numeric(xg_public_test_df$rel_apply) - 1)
# Cross-validated XGBoost on the private data across learning rates.
# The original repeated the same xgb.cv call six times with only eta
# changing; a helper removes the copy-paste while keeping each run
# byte-for-byte equivalent (including the set.seed before every call).
run_private_cv <- function(eta) {
  # Reseed before every run, exactly as the original script did
  # immediately before each xgb.cv call.
  set.seed(111111)
  xgb.cv(data = dtrain_private,          # Training data
         nfold = 5,                      # 5-fold CV
         eta = eta,                      # Learning rate
         nrounds = 100,                  # Boosting rounds
         early_stopping_rounds = 100,    # Stop if no improvement
         verbose = 1,                    # Print fit
         nthread = 1,                    # Single thread
         print_every_n = 20,             # Log every 20th iteration
         objective = "reg:squarederror", # Regression objective
         eval_metric = "rmse")           # Evaluate with RMSE
}

# Baseline run (eta = 0.1)
private_bst <- run_private_cv(0.1)
## (earlier run) [100] train-rmse:0.007236+0.000482 test-rmse:0.081302+0.018247

# Learning-rate sweep, final-round test RMSE from earlier runs noted:
private_bst_mod_1 <- run_private_cv(0.3)
## (earlier run) [100] train-rmse:0.000871+0.000036 test-rmse:0.082332+0.019176
private_bst_mod_2 <- run_private_cv(0.1)
## (earlier run) [100] train-rmse:0.007236+0.000482 test-rmse:0.081302+0.018247
private_bst_mod_3 <- run_private_cv(0.05)
## (earlier run) [100] train-rmse:0.019629+0.000359 test-rmse:0.084678+0.019389
private_bst_mod_4 <- run_private_cv(0.01)
## (earlier run) [100] train-rmse:0.240257+0.002995 test-rmse:0.256018+0.026229
private_bst_mod_5 <- run_private_cv(0.005)
## (earlier run) [100] train-rmse:0.380292+0.004331 test-rmse:0.387905+0.026234
# Collect the per-iteration test RMSE for each learning rate.
# The original repeated the same cbind/rename five times; a helper
# removes the duplication and produces identical data frames
# (columns: iter, test_rmse_mean, eta).
extract_eta_log <- function(mod, eta) {
  log_df <- mod$evaluation_log[, c("iter", "test_rmse_mean")]
  cbind.data.frame(log_df, eta = rep(eta, nrow(mod$evaluation_log)))
}
pd1 <- extract_eta_log(private_bst_mod_1, 0.3)
pd2 <- extract_eta_log(private_bst_mod_2, 0.1)
pd3 <- extract_eta_log(private_bst_mod_3, 0.05)
pd4 <- extract_eta_log(private_bst_mod_4, 0.01)
pd5 <- extract_eta_log(private_bst_mod_5, 0.005)
# Stack into one plotting data set
plot_data <- rbind.data.frame(pd1, pd2, pd3, pd4, pd5)
# Convert eta to a factor so ggplot treats it as a discrete color scale
plot_data$eta <- as.factor(plot_data$eta)
# Shared theme and labels for the learning-rate comparison plots
cv_plot_theme <- theme(panel.grid.major = element_blank(), # Remove grid
                       panel.grid.minor = element_blank(), # Remove grid
                       panel.border = element_blank(),     # Remove grid
                       panel.background = element_blank()) # Remove grid
cv_plot_labs <- labs(x = "Number of Trees", title = "Error Rate v Number of Trees",
                     y = "Error Rate", color = "Learning \n Rate")

# Scatter of test RMSE against iteration, colored by learning rate
g_1 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_point(alpha = 0.5) +
  theme_bw() +
  cv_plot_theme +
  cv_plot_labs
g_1

# Smoothed trend lines
g_2 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_smooth(alpha = 0.5) +
  theme_bw() +
  cv_plot_theme +
  cv_plot_labs
g_2
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Smoothed trend lines, zoomed in on the low-error region
g_3 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_smooth(alpha = 0.5) +
  theme_bw() +
  cv_plot_theme +
  cv_plot_labs +
  xlim(0, 100) + # Zoom in on the x-axis (number of trees)
  ylim(0, 0.1)   # Zoom in on the y-axis (error rate)
g_3
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 293 rows containing non-finite values (`stat_smooth()`).
# Final private XGBoost model at the chosen learning rate (0.1).
# The original passed nfold = 5, which xgboost() ignores — the run log
# showed 'Parameters: { "nfold" } are not used'. Cross-validation
# belongs to xgb.cv(); the argument is dropped here.
private_bst_mod_final <- xgboost(data = dtrain_private, # Set training data
                                 eta = 0.1, # Set learning rate
                                 nrounds = 100, # Set number of rounds
                                 early_stopping_rounds = 100, # Set number of rounds to stop at if there is no improvement
                                 verbose = 1, # 1 - Prints out fit
                                 nthread = 1, # Set number of parallel threads
                                 print_every_n = 20, # Prints out result every 20th iteration
                                 objective = "reg:squarederror", # Change this to regression
                                 eval_metric = "rmse")
## (earlier run)
## [1] train-rmse:0.550939
## [21] train-rmse:0.086868
## [41] train-rmse:0.025991
## [61] train-rmse:0.015625
## [81] train-rmse:0.011242
## [100] train-rmse:0.008119
# NOTE(review): this section re-creates dtrain_private / dtest_private
# with the exact expressions used earlier and retrains
# private_bst_mod_final with identical settings — it appears to be an
# accidental duplicate; kept for fidelity, confirm before removing.
dtrain_private <- xgb.DMatrix(data = as.matrix(xg_private_train_df[,c(2:8, 10:19)]), label = as.numeric(xg_private_train_df$rel_apply) -1)
dtest_private <- xgb.DMatrix(data = as.matrix(xg_private_test_df[,c(2:8, 10:19)]), label = as.numeric(xg_private_test_df$rel_apply) - 1)
# nfold is dropped: xgboost() ignored it and warned
# 'Parameters: { "nfold" } are not used'.
private_bst_mod_final <- xgboost(data = dtrain_private, # Set training data
                                 eta = 0.1, # Set learning rate
                                 nrounds = 100, # Set number of rounds
                                 early_stopping_rounds = 100, # Set number of rounds to stop at if there is no improvement
                                 verbose = 1, # 1 - Prints out fit
                                 nthread = 1, # Set number of parallel threads
                                 print_every_n = 20, # Prints out result every 20th iteration
                                 objective = "reg:squarederror", # Change this to regression
                                 eval_metric = "rmse")
## (earlier run)
## [1] train-rmse:0.550939
## [21] train-rmse:0.086868
## [41] train-rmse:0.025991
## [61] train-rmse:0.015625
## [81] train-rmse:0.011242
## [100] train-rmse:0.008119
# XGBoost for private schools with default parameters (no explicit
# objective/eta/eval_metric).
# NOTE(review): private_bst_1 is not referenced later in this chunk —
# possibly exploratory; confirm before removing.
set.seed(111111)
private_bst_1 <- xgboost(data = dtrain_private, # Training data
nrounds = 100, # Number of boosting rounds
verbose = 1, # Print fit
print_every_n = 20 # Results every 20th iter
)
## [1] train-rmse:0.435761
## [21] train-rmse:0.015697
## [41] train-rmse:0.006346
## [61] train-rmse:0.003051
## [81] train-rmse:0.001486
## [100] train-rmse:0.000905
# Feature importance (gain / cover / frequency) for the full private model
importance_matrix_private <- xgb.importance( model = private_bst_mod_final)
# Print the feature importances
print(importance_matrix_private)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: rel_apply_sat 7.397168e-01 0.169789975 0.125720272
## 2: rel_attend 2.418911e-01 0.366571579 0.190413829
## 3: rel_att_cond_app 9.441161e-03 0.182131149 0.166579361
## 4: attend 1.203453e-03 0.026034666 0.071241488
## 5: stderr_rel_apply 9.455367e-04 0.021140367 0.044525930
## 6: rel_att_cond_app_sat 8.453134e-04 0.026792937 0.043740178
## 7: rel_attend_sat 8.332962e-04 0.019323760 0.041644840
## 8: stderr_rel_apply_sat 8.140242e-04 0.028844731 0.034311158
## 9: stderr_rel_attend_sat 7.718536e-04 0.017949139 0.029858565
## 10: stderr_attend 6.782930e-04 0.029485409 0.048192771
## 11: par_income_bin 6.489632e-04 0.040518869 0.058407543
## 12: attend_sat 5.875982e-04 0.018322192 0.038501833
## 13: stderr_rel_attend 5.327781e-04 0.019033832 0.033263489
## 14: attend_level 4.292144e-04 0.008855963 0.020167627
## 15: stderr_attend_sat 3.312067e-04 0.017097604 0.033263489
## 16: attend_level_sat 2.667013e-04 0.004667222 0.015191200
## 17: tier 6.269296e-05 0.003440606 0.004976427
# Bar chart of the feature importances for the private model
xgb.plot.importance(importance_matrix_private)
# Load the SHAP helper functions (shap.score.rank, shap.prep,
# var_importance, plot.shap.summary).
# NOTE(review): hard-coded to ~/Downloads — fragile; consider moving
# the script into the project and sourcing a relative path.
source("~/Downloads/a_insights_shap_functions.r")
# Calculate SHAP importance on the same feature matrix used to train
# the full private model (columns 2:8 and 10:19).
shap_result <- shap.score.rank(xgb_model = private_bst_mod_final,
X_train =as.matrix(xg_private_train_df[,c(2:8, 10:19)]),
shap_approx = F)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## make SHAP score by decreasing order
# Plot the top-10 SHAP variable importances
var_importance(shap_result, top_n=10)
# Long-format SHAP values for the summary plot (top 10 features)
shap_long = shap.prep(shap = shap_result,
X_train = as.matrix(xg_private_train_df[,c(2:8, 10:19)]),
top_n = 10)
## Loading required package: ggforce
plot.shap.summary(data_long = shap_long)
# Reduced-feature private model: keep only columns 2:8 and 17:19.
dtrain_private2 <- xgb.DMatrix(data = as.matrix(xg_private_train_df[,c(2:8, 17:19)]), label = as.numeric(xg_private_train_df$rel_apply) -1)
dtest_private2 <- xgb.DMatrix(data = as.matrix(xg_private_test_df[,c(2:8, 17:19)]), label = as.numeric(xg_private_test_df$rel_apply) - 1)
# nfold is dropped: xgboost() ignored it and warned
# 'Parameters: { "nfold" } are not used'.
private_bst_mod_final2 <- xgboost(data = dtrain_private2, # Set training data
                                  eta = 0.1, # Set learning rate
                                  nrounds = 100, # Set number of rounds
                                  early_stopping_rounds = 100, # Set number of rounds to stop at if there is no improvement
                                  verbose = 1, # 1 - Prints out fit
                                  nthread = 1, # Set number of parallel threads
                                  print_every_n = 20, # Prints out result every 20th iteration
                                  objective = "reg:squarederror", # Change this to regression
                                  eval_metric = "rmse")
## (earlier run)
## [1] train-rmse:0.554705
## [21] train-rmse:0.118128
## [41] train-rmse:0.054982
## [61] train-rmse:0.038942
## [81] train-rmse:0.029498
## [100] train-rmse:0.023104
# Feature importance for private schools
importance_matrix_private <- xgb.importance( model = private_bst_mod_final2)
# Print the feature importances
print(importance_matrix_private)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: par_income_bin 0.456318412 0.14352442 0.11560430
## 2: attend_sat 0.135190376 0.13729117 0.11286269
## 3: attend 0.122619651 0.14487323 0.16952250
## 4: stderr_rel_attend_sat 0.104271406 0.13397901 0.11286269
## 5: attend_level 0.100552104 0.09488949 0.11034955
## 6: rel_att_cond_app_sat 0.021067137 0.16861107 0.13068312
## 7: attend_level_sat 0.019964851 0.05287290 0.06945396
## 8: stderr_attend 0.019887394 0.05889865 0.08407585
## 9: stderr_attend_sat 0.011067454 0.04446879 0.06808316
## 10: tier 0.009061217 0.02059128 0.02650217
# Plot of feature importance for PRIVATE
xgb.plot.importance(importance_matrix_private)
source("~/Downloads/a_insights_shap_functions.r")
# Calculate SHAP importance
shap_result <- shap.score.rank(xgb_model = private_bst_mod_final2,
X_train =as.matrix(xg_private_train_df[,c(2:8, 17:19)]),
shap_approx = F)
## make SHAP score by decreasing order
# Plot SHAP importance
var_importance(shap_result, top_n=10)
shap_long = shap.prep(shap = shap_result,
X_train = as.matrix(xg_private_train_df[,c(2:8, 17:19)]),
top_n = 10)
plot.shap.summary(data_long = shap_long)
# Set matrices for xgboost analysis for public schools
# Label is rel_apply shifted by -1.
# NOTE(review): the as.numeric(...) - 1 shift suggests rel_apply is a factor
# being converted to 0-based codes — confirm, since factor-level order would
# then determine the regression target.
dtrain_public <- xgb.DMatrix(data = as.matrix(xg_public_train_df[,c(2:8, 10:51)]), label = as.numeric(xg_public_train_df$rel_apply) -1)
dtest_public <- xgb.DMatrix(data = as.matrix(xg_public_test_df[,c(2:8, 10:51)]), label = as.numeric(xg_public_test_df$rel_apply) - 1)
# Use xgb.cv to run cross-validation inside xgboost
# Repeat for public school df
# Long (400-round) run to find where the test RMSE curve flattens.
# NOTE(review): early_stopping_rounds == nrounds, so early stopping can
# never fire; the full 400 rounds always run.
set.seed(111111)
bst <- xgb.cv(data = dtrain_public, # Set training data
nfold = 5, # Use 5 fold cross-validation
eta = 0.1, # Set learning rate
nrounds = 400, # Set number of rounds
early_stopping_rounds = 400, # Set number of rounds to stop at if there is no improvement
verbose = 1, # 1 - Prints out fit
nthread = 1, # Set number of parallel threads
print_every_n = 20, # Prints out result every 20th iteration
objective = "reg:squarederror", # Change this to regression
eval_metric = "rmse")
## [1] train-rmse:0.498638+0.008287 test-rmse:0.498787+0.034812
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 400 rounds.
##
## [21] train-rmse:0.074538+0.000989 test-rmse:0.106366+0.008186
## [41] train-rmse:0.016654+0.000267 test-rmse:0.074568+0.007089
## [61] train-rmse:0.006790+0.000123 test-rmse:0.071566+0.007040
## [81] train-rmse:0.003968+0.000058 test-rmse:0.070815+0.006861
## [101] train-rmse:0.002410+0.000070 test-rmse:0.070635+0.006827
## [121] train-rmse:0.001487+0.000104 test-rmse:0.070569+0.006789
## [141] train-rmse:0.000967+0.000096 test-rmse:0.070539+0.006788
## [161] train-rmse:0.000720+0.000024 test-rmse:0.070526+0.006779
## [181] train-rmse:0.000714+0.000026 test-rmse:0.070527+0.006780
## [201] train-rmse:0.000714+0.000026 test-rmse:0.070527+0.006780
## [221] train-rmse:0.000714+0.000026 test-rmse:0.070527+0.006780
## [241] train-rmse:0.000714+0.000026 test-rmse:0.070527+0.006780
## [261] train-rmse:0.000714+0.000026 test-rmse:0.070527+0.006780
## [281] train-rmse:0.000714+0.000026 test-rmse:0.070527+0.006780
## [301] train-rmse:0.000714+0.000026 test-rmse:0.070527+0.006780
## [321] train-rmse:0.000714+0.000026 test-rmse:0.070527+0.006780
## [341] train-rmse:0.000714+0.000026 test-rmse:0.070527+0.006780
## [361] train-rmse:0.000714+0.000026 test-rmse:0.070527+0.006780
## [381] train-rmse:0.000714+0.000026 test-rmse:0.070527+0.006780
## [400] train-rmse:0.000714+0.000026 test-rmse:0.070527+0.006780
# Baseline public-school fit with xgboost defaults
# (no eta/objective given: defaults are eta = 0.3, objective = reg:squarederror).
set.seed(111111)
public_bst_1 <- xgboost(data = dtrain_public, # Set training data
nrounds = 100, # Set number of rounds
verbose = 1, # 1 - Prints out fit
print_every_n = 20 # Prints out result every 20th iteration
)
## [1] train-rmse:0.392911
## [21] train-rmse:0.007654
## [41] train-rmse:0.001731
## [61] train-rmse:0.000629
## [81] train-rmse:0.000629
## [100] train-rmse:0.000629
# Use xgb.cv to run cross-validation inside xgboost (100 rounds, eta = 0.1)
set.seed(111111)
public_bst <- xgb.cv(data = dtrain_public, # Set training data
nfold = 5, # Use 5 fold cross-validation
eta = 0.1, # Set learning rate
nrounds = 100, # Set number of rounds
early_stopping_rounds = 100, # Set number of rounds to stop at if there is no improvement
verbose = 1, # 1 - Prints out fit
nthread = 1, # Set number of parallel threads
print_every_n = 20, # Prints out result every 20th iteration
objective = "reg:squarederror", # Change this to regression
eval_metric = "rmse")
## [1] train-rmse:0.498638+0.008287 test-rmse:0.498787+0.034812
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
##
## [21] train-rmse:0.074538+0.000989 test-rmse:0.106366+0.008186
## [41] train-rmse:0.016654+0.000267 test-rmse:0.074568+0.007089
## [61] train-rmse:0.006790+0.000123 test-rmse:0.071566+0.007040
## [81] train-rmse:0.003968+0.000058 test-rmse:0.070815+0.006861
## [100] train-rmse:0.002485+0.000082 test-rmse:0.070653+0.006824
# Learning-rate sweep, run 1 of 5: eta = 0.3
set.seed(111111)
public_bst_mod_1 <- xgb.cv(data = dtrain_public, # Set training data
nfold = 5, # Use 5 fold cross-validation
eta = 0.3, # Set learning rate
nrounds = 100, # Set number of rounds
early_stopping_rounds = 100, # Set number of rounds to stop at if there is no improvement
verbose = 1, # 1 - Prints out fit
nthread = 1, # Set number of parallel threads
print_every_n = 20, # Prints out result every 20th iteration
objective = "reg:squarederror", # Change this to regression
eval_metric = "rmse")
## [1] train-rmse:0.393587+0.006288 test-rmse:0.397344+0.028556
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
##
## [21] train-rmse:0.006762+0.000454 test-rmse:0.074262+0.005257
## [41] train-rmse:0.001389+0.000197 test-rmse:0.073524+0.005366
## [61] train-rmse:0.000625+0.000038 test-rmse:0.073471+0.005347
## [81] train-rmse:0.000625+0.000038 test-rmse:0.073471+0.005347
## [100] train-rmse:0.000625+0.000038 test-rmse:0.073471+0.005347
# Learning-rate sweep, run 2 of 5: eta = 0.1
# NOTE(review): identical to the `public_bst` cv run above (same seed and
# parameters) — kept so the sweep objects pd1..pd5 line up, but redundant.
set.seed(111111)
public_bst_mod_2 <- xgb.cv(data = dtrain_public, # Set training data
nfold = 5, # Use 5 fold cross-validation
eta = 0.1, # Set learning rate
nrounds = 100, # Set number of rounds
early_stopping_rounds = 100, # Set number of rounds to stop at if there is no improvement
verbose = 1, # 1 - Prints out fit
nthread = 1, # Set number of parallel threads
print_every_n = 20, # Prints out result every 20th iteration
objective = "reg:squarederror", # Change this to regression
eval_metric = "rmse")
## [1] train-rmse:0.498638+0.008287 test-rmse:0.498787+0.034812
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
##
## [21] train-rmse:0.074538+0.000989 test-rmse:0.106366+0.008186
## [41] train-rmse:0.016654+0.000267 test-rmse:0.074568+0.007089
## [61] train-rmse:0.006790+0.000123 test-rmse:0.071566+0.007040
## [81] train-rmse:0.003968+0.000058 test-rmse:0.070815+0.006861
## [100] train-rmse:0.002485+0.000082 test-rmse:0.070653+0.006824
# Learning-rate sweep, run 3 of 5: eta = 0.05
set.seed(111111)
public_bst_mod_3 <- xgb.cv(data = dtrain_public, # Set training data
nfold = 5, # Use 5 fold cross-validation
eta = 0.05, # Set learning rate
nrounds = 100, # Set number of rounds
early_stopping_rounds = 100, # Set number of rounds to stop at if there is no improvement
verbose = 1, # 1 - Prints out fit
nthread = 1, # Set number of parallel threads
print_every_n = 20, # Prints out result every 20th iteration
objective = "reg:squarederror", # Change this to regression
eval_metric = "rmse")
## [1] train-rmse:0.525009+0.008782 test-rmse:0.524401+0.036329
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
##
## [21] train-rmse:0.201132+0.002951 test-rmse:0.215460+0.016975
## [41] train-rmse:0.081311+0.001072 test-rmse:0.112250+0.007407
## [61] train-rmse:0.035773+0.000357 test-rmse:0.082799+0.006179
## [81] train-rmse:0.017891+0.000265 test-rmse:0.075423+0.007179
## [100] train-rmse:0.010705+0.000262 test-rmse:0.072971+0.007478
# Learning-rate sweep, run 4 of 5: eta = 0.01
set.seed(111111)
public_bst_mod_4 <- xgb.cv(data = dtrain_public, # Set training data
nfold = 5, # Use 5 fold cross-validation
eta = 0.01, # Set learning rate
nrounds = 100, # Set number of rounds
early_stopping_rounds = 100, # Set number of rounds to stop at if there is no improvement
verbose = 1, # 1 - Prints out fit
nthread = 1, # Set number of parallel threads
print_every_n = 20, # Prints out result every 20th iteration
objective = "reg:squarederror", # Change this to regression
eval_metric = "rmse")
## [1] train-rmse:0.546130+0.009176 test-rmse:0.544945+0.037534
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
##
## [21] train-rmse:0.450830+0.007398 test-rmse:0.452225+0.032298
## [41] train-rmse:0.372723+0.005968 test-rmse:0.376796+0.027986
## [61] train-rmse:0.308653+0.004829 test-rmse:0.315404+0.024344
## [81] train-rmse:0.255994+0.003891 test-rmse:0.265864+0.020819
## [100] train-rmse:0.214652+0.003176 test-rmse:0.227938+0.017899
# Learning-rate sweep, run 5 of 5: eta = 0.005
set.seed(111111)
public_bst_mod_5 <- xgb.cv(data = dtrain_public, # Set training data
nfold = 5, # Use 5 fold cross-validation
eta = 0.005, # Set learning rate
nrounds = 100, # Set number of rounds
early_stopping_rounds = 100, # Set number of rounds to stop at if there is no improvement
verbose = 1, # 1 - Prints out fit
nthread = 1, # Set number of parallel threads
print_every_n = 20, # Prints out result every 20th iteration
objective = "reg:squarederror", # Change this to regression
eval_metric = "rmse")
## [1] train-rmse:0.548771+0.009226 test-rmse:0.547516+0.037684
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
##
## [21] train-rmse:0.498612+0.008292 test-rmse:0.498663+0.034964
## [41] train-rmse:0.453205+0.007442 test-rmse:0.454545+0.032409
## [61] train-rmse:0.412086+0.006679 test-rmse:0.414768+0.030110
## [81] train-rmse:0.374844+0.006005 test-rmse:0.378823+0.028105
## [100] train-rmse:0.342718+0.005436 test-rmse:0.347876+0.026375
# Collect the per-eta CV learning curves for plotting.
# Refactor: the five copy-pasted cbind/rename blocks are replaced by one
# helper; the pd1..pd5 and plot_data objects keep their original names and
# contents so downstream plotting code is unaffected.

# extract_eta_results(): pull (iter, test_rmse_mean) from an xgb.cv result
# and tag every row with the learning rate that produced it.
#   cv_fit — an xgb.cv() result (uses its $evaluation_log)
#   eta    — the learning rate used for that run (numeric scalar)
# Returns a data.frame with columns iter, test_rmse_mean, eta.
extract_eta_results <- function(cv_fit, eta) {
  out <- cbind.data.frame(cv_fit$evaluation_log[, c("iter", "test_rmse_mean")],
                          rep(eta, nrow(cv_fit$evaluation_log)))
  names(out)[3] <- "eta"
  out
}
# Extract results for each learning-rate run
pd1 <- extract_eta_results(public_bst_mod_1, 0.3)
pd2 <- extract_eta_results(public_bst_mod_2, 0.1)
pd3 <- extract_eta_results(public_bst_mod_3, 0.05)
pd4 <- extract_eta_results(public_bst_mod_4, 0.01)
pd5 <- extract_eta_results(public_bst_mod_5, 0.005)
# Join datasets
plot_data <- rbind.data.frame(pd1, pd2, pd3, pd4, pd5)
# Convert eta to a factor so ggplot colours each learning rate discretely
plot_data$eta <- as.factor(plot_data$eta)
# Plot points: test RMSE vs boosting iteration, coloured by learning rate
g_4 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta))+
geom_point(alpha = 0.5) +
theme_bw() + # Set theme
theme(panel.grid.major = element_blank(), # Remove grid
panel.grid.minor = element_blank(), # Remove grid
panel.border = element_blank(), # Remove grid
panel.background = element_blank()) + # Remove grid
labs(x = "Number of Trees", title = "Error Rate v Number of Trees",
y = "Error Rate", color = "Learning \n Rate") # Set labels
g_4
# Plot smoothed (loess) curves of the same learning curves
g_5 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta))+
geom_smooth(alpha = 0.5) +
theme_bw() + # Set theme
theme(panel.grid.major = element_blank(), # Remove grid
panel.grid.minor = element_blank(), # Remove grid
panel.border = element_blank(), # Remove grid
panel.background = element_blank()) + # Remove grid
labs(x = "Number of Trees", title = "Error Rate v Number of Trees",
y = "Error Rate", color = "Learning \n Rate")
g_5
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Plot smoothed curves, zoomed to the first 100 trees and RMSE <= 0.1.
# Fix: replaced xlim()/ylim() with coord_cartesian(). The scale limits
# remove out-of-range rows BEFORE stat_smooth runs (hence the original
# "Removed 275 rows containing non-finite values" warning), which distorts
# the loess fit near the boundary; coord_cartesian() only zooms the
# viewport and keeps all data in the fit.
g_6 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta))+
geom_smooth(alpha = 0.5) +
theme_bw() + # Set theme
theme(panel.grid.major = element_blank(), # Remove grid
panel.grid.minor = element_blank(), # Remove grid
panel.border = element_blank(), # Remove grid
panel.background = element_blank()) + # Remove grid
labs(x = "Number of Trees", title = "Error Rate v Number of Trees",
y = "Error Rate", color = "Learning \n Rate") + # Set labels
coord_cartesian(xlim = c(0, 100), ylim = c(0, 0.1)) # Zoom without dropping rows
g_6
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Final public-school model on the full feature set (eta = 0.1 chosen from
# the sweep above).
# Fix: removed `nfold = 5` — nfold belongs to xgb.cv(), not xgboost(); the
# run warned 'Parameters: { "nfold" } are not used.'
# NOTE(review): with no watchlist, early stopping tracks train-rmse and
# early_stopping_rounds == nrounds, so it never triggers.
public_bst_mod_final <- xgboost(data = dtrain_public, # Set training data
                                eta = 0.1, # Set learning rate
                                nrounds = 100, # Set number of rounds
                                early_stopping_rounds = 100, # Rounds without improvement before stopping
                                verbose = 1, # 1 - Prints out fit
                                nthread = 1, # Set number of parallel threads
                                print_every_n = 20, # Prints out result every 20th iteration
                                objective = "reg:squarederror", # Regression objective
                                eval_metric = "rmse")
## [1] train-rmse:0.498477
## [21] train-rmse:0.073745
## [41] train-rmse:0.016763
## [61] train-rmse:0.007070
## [81] train-rmse:0.004419
## [100] train-rmse:0.002912
# Set matrices for xgboost analysis for public schools
# NOTE(review): this re-creates dtrain_public/dtest_public with the same
# arguments as earlier in the script — redundant but harmless.
dtrain_public <- xgb.DMatrix(data = as.matrix(xg_public_train_df[,c(2:8, 10:51)]), label = as.numeric(xg_public_train_df$rel_apply) -1)
dtest_public <- xgb.DMatrix(data = as.matrix(xg_public_test_df[,c(2:8, 10:51)]), label = as.numeric(xg_public_test_df$rel_apply) - 1)
# Feature importance for public schools (comment previously said "private")
importance_matrix_public <- xgb.importance( model = public_bst_mod_final)
# Print the feature importances
print(importance_matrix_public)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: rel_apply_sat 8.795855e-01 0.130952509 0.072534404
## 2: rel_attend 5.082076e-02 0.222682181 0.104357798
## 3: stderr_rel_apply_instate 1.271313e-02 0.014796146 0.013761468
## 4: rel_apply_oostate 1.108467e-02 0.058169339 0.045011468
## 5: par_income_bin 1.010362e-02 0.016243945 0.062500000
## 6: rel_att_cond_app 9.433339e-03 0.090582158 0.070814220
## 7: rel_apply_instate 4.819239e-03 0.030067743 0.036697248
## 8: attend_level_instate_sat 4.146900e-03 0.004379145 0.007167431
## 9: stderr_rel_apply 1.985254e-03 0.021999392 0.030676606
## 10: stderr_attend 1.569007e-03 0.009162243 0.030963303
## 11: stderr_rel_apply_sat 1.423387e-03 0.006524032 0.015768349
## 12: attend 1.080127e-03 0.014023987 0.049025229
## 13: stderr_rel_apply_oostate_sat 8.877320e-04 0.011053318 0.010894495
## 14: attend_sat 8.743783e-04 0.009559047 0.031250000
## 15: stderr_rel_attend_oostate_sat 8.553265e-04 0.010642215 0.010321101
## 16: rel_att_cond_app_instate 7.889163e-04 0.013205355 0.015194954
## 17: attend_oostate 6.757113e-04 0.014066885 0.013474771
## 18: stderr_attend_instate_sat 6.683678e-04 0.030643287 0.017775229
## 19: rel_apply_oostate_sat 6.502065e-04 0.030107066 0.019782110
## 20: attend_level 5.358668e-04 0.005837668 0.012327982
## 21: attend_instate 4.874149e-04 0.009087172 0.018061927
## 22: stderr_attend_sat 3.853706e-04 0.010499222 0.017775229
## 23: rel_att_cond_app_oostate 3.491389e-04 0.009980875 0.015481651
## 24: rel_att_cond_app_sat 3.384530e-04 0.015328793 0.021502294
## 25: rel_apply_instate_sat 3.328558e-04 0.008154146 0.015481651
## 26: rel_att_cond_app_instate_sat 3.031087e-04 0.008954904 0.011467890
## 27: stderr_rel_attend_sat 2.889052e-04 0.006795717 0.010894495
## 28: rel_attend_sat 2.520679e-04 0.019736536 0.026376147
## 29: attend_level_sat 2.235196e-04 0.003903695 0.008314220
## 30: stderr_rel_attend_instate 1.854963e-04 0.008343611 0.008600917
## 31: stderr_attend_oostate 1.822433e-04 0.009140794 0.013474771
## 32: stderr_attend_instate 1.715580e-04 0.012912220 0.013188073
## 33: stderr_rel_apply_oostate 1.661473e-04 0.007117450 0.008314220
## 34: stderr_rel_attend_oostate 1.613415e-04 0.002906322 0.006594037
## 35: rel_att_cond_app_oostate_sat 1.588805e-04 0.006441811 0.012901376
## 36: rel_attend_instate 1.580274e-04 0.012286628 0.010894495
## 37: stderr_rel_attend_instate_sat 1.543750e-04 0.003335299 0.005447248
## 38: stderr_rel_apply_instate_sat 1.498414e-04 0.015418164 0.011467890
## 39: stderr_attend_oostate_sat 1.497366e-04 0.007088852 0.012901376
## 40: attend_level_instate 1.261494e-04 0.004336247 0.004300459
## 41: attend_oostate_sat 1.147398e-04 0.004257601 0.009747706
## 42: rel_attend_oostate_sat 1.080170e-04 0.007160348 0.012614679
## 43: rel_attend_instate_sat 8.997062e-05 0.006992332 0.012614679
## 44: attend_level_oostate_sat 7.282942e-05 0.006255921 0.004013761
## 45: attend_level_oostate 6.523056e-05 0.007471357 0.004587156
## 46: stderr_rel_attend 4.904695e-05 0.004071711 0.008887615
## 47: attend_instate_sat 4.726513e-05 0.010610042 0.010321101
## 48: rel_attend_oostate 2.448261e-05 0.019354031 0.011467890
## 49: tier 2.394083e-06 0.007360538 0.002006881
## Feature Gain Cover Frequency
# Plot of feature importance for PUBLIC (comment previously said "PRIVATE")
xgb.plot.importance(importance_matrix_public)
# From this graph, I'm taking everything from rel_apply_instate and below, and
# printing the bottom 10 of those when it comes to the next section,
# "Detailed Features".
# Load SHAP helper functions.
# NOTE(review): hard-coded per-user path — breaks on other machines.
source("~/Downloads/a_insights_shap_functions.r")
# Calculate SHAP importance for the full-feature public model
shap_result <- shap.score.rank(xgb_model = public_bst_mod_final,
X_train =as.matrix(xg_public_train_df[,c(2:8, 10:51)]),
shap_approx = F)
## make SHAP score by decreasing order
# Plot SHAP importance (top 10 features)
var_importance(shap_result, top_n=10)
# Long format for the SHAP summary plot
shap_long = shap.prep(shap = shap_result,
X_train = as.matrix(xg_public_train_df[,c(2:8, 10:51)]),
top_n = 10)
plot.shap.summary(data_long = shap_long)
# Sanity check: do the model's training feature names match the columns we
# intend to score with?
# Fix: an xgb.Booster stores its training feature names in $feature_names;
# `$data` is not a stored field, so colnames(public_bst_mod_final$data)
# returned NULL and the original check wrongly reported every one of the 49
# columns as "in the new data but not in the model".
model_feature_names <- public_bst_mod_final$feature_names
# Check feature names in the new data
data_feature_names <- colnames(xg_public_train_df[,c(2:8, 10:51)])
# Compare — both calls should return character(0) when the sets agree
setdiff(model_feature_names, data_feature_names) # Features in the model but not in the new data
setdiff(data_feature_names, model_feature_names) # Features in the new data but not in the model
# Set matrices for xgboost analysis for public schools, restricted to the
# top-10 SHAP features (hand-picked column indices).
# NOTE(review): hard-coded indices — confirm they still map to the intended
# columns if xg_public_train_df ever changes shape.
dtrain_public2 <- xgb.DMatrix(data = as.matrix(xg_public_train_df[, c(6, 32, 10, 37, 47, 7, 17, 27, 22, 42)]), label = as.numeric(xg_public_train_df$rel_apply) -1)
dtest_public2 <- xgb.DMatrix(data = as.matrix(xg_public_test_df[, c(6, 32, 10, 37, 47, 7, 17, 27, 22, 42)]), label = as.numeric(xg_public_test_df$rel_apply) -1)
# Final reduced model for public schools.
# Fix: removed `nfold = 5` — it is an xgb.cv()-only argument and xgboost()
# warned 'Parameters: { "nfold" } are not used.'
public_bst_mod_final2 <- xgboost(data = dtrain_public2, # Set training data
                                 eta = 0.1, # Set learning rate
                                 nrounds = 100, # Set number of rounds
                                 early_stopping_rounds = 100, # Rounds without improvement before stopping
                                 verbose = 1, # 1 - Prints out fit
                                 nthread = 1, # Set number of parallel threads
                                 print_every_n = 20, # Prints out result every 20th iteration
                                 objective = "reg:squarederror", # Regression objective
                                 eval_metric = "rmse")
## [1] train-rmse:0.502742
## [21] train-rmse:0.106989
## [41] train-rmse:0.048002
## [61] train-rmse:0.032751
## [81] train-rmse:0.024787
## [100] train-rmse:0.019116
# Feature importance for public schools
importance_matrix_public <- xgb.importance( model = public_bst_mod_final2)
# Print the feature importances
print(importance_matrix_public)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: stderr_rel_attend_sat 0.45761309 0.17337198 0.10364842
## 2: attend_sat 0.20058436 0.15060523 0.20370370
## 3: stderr_rel_apply 0.07623884 0.10366050 0.11083472
## 4: stderr_rel_apply_oostate_sat 0.06267240 0.06919137 0.06688778
## 5: stderr_rel_apply_oostate 0.04962404 0.04335253 0.08761747
## 6: stderr_rel_apply_instate 0.04205569 0.12570604 0.12907684
## 7: stderr_attend_sat 0.03500050 0.10029369 0.08430072
## 8: attend_instate_sat 0.03289434 0.11217983 0.09894970
## 9: stderr_rel_apply_instate_sat 0.03181897 0.08745401 0.07877280
## 10: attend_level_oostate 0.01149776 0.03418480 0.03620785
# Plot of feature importance for PUBLIC (comment previously said "PRIVATE")
xgb.plot.importance(importance_matrix_public)
# Reload SHAP helpers (hard-coded per-user path — breaks on other machines)
source("~/Downloads/a_insights_shap_functions.r")
# Calculate SHAP importance for the reduced public model
shap_result <- shap.score.rank(xgb_model = public_bst_mod_final2,
X_train =as.matrix(xg_public_train_df[,c(6, 32, 10, 37, 47, 7, 17, 27, 22, 42)]),
shap_approx = F)
## make SHAP score by decreasing order
# Plot SHAP importance
var_importance(shap_result, top_n=10)
shap_long = shap.prep(shap = shap_result,
X_train = as.matrix(xg_public_train_df[,c(6, 32, 10, 37, 47, 7, 17, 27, 22, 42)]),
top_n = 10)
plot.shap.summary(data_long = shap_long)
# private_bst_mod_final (full-feature private model; comment previously named
# the nonexistent "Private_bst_1") predicts the held-out test set for RMSE.
# NOTE(review): dtest_private is built outside this chunk — confirm its
# columns match the feature set private_bst_mod_final was trained on.
preds_private <- predict(private_bst_mod_final, dtest_private)
# Actual values from xg_private_test_df, using the same -1 label shift as training
actual_private <- as.numeric(xg_private_test_df$rel_apply) - 1
# Calculate RMSE
rmse_private <- rmse(actual_private, preds_private)
# Print RMSE
print(paste("Private Model RMSE:", rmse_private))
## [1] "Private Model RMSE: 0.0837600179618872"
# public_bst_mod_final (full-feature public model) predicts the held-out test set
preds_public <- predict(public_bst_mod_final, dtest_public)
# Actual values from xg_public_test_df, same -1 label shift
actual_public <- as.numeric(xg_public_test_df$rel_apply) - 1
# RMSE for public schools
rmse_public <- rmse(actual_public, preds_public)
# Print RMSE
print(paste("Public Model RMSE:", rmse_public))
## [1] "Public Model RMSE: 0.0830721303856991"
# Actual vs. predicted visualizations for private
plot(actual_private, preds_private, main = "Private Model: Predicted vs Actual",
xlab = "Actual Values", ylab = "Predicted Values",
col="blue", pch=19)
abline(0, 1, col="red") # 45 deg. reference line
# Plot actual vs. predicted for public model
plot(actual_public, preds_public, main="Public Model: Predicted vs Actual",
xlab="Actual Values", ylab="Predicted Values",
col="blue", pch=19)
abline(0, 1, col="red") # 45 degree reference line